library(data.table)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.7
## ✔ tidyr   1.1.4     ✔ stringr 1.4.0
## ✔ readr   2.1.1     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()   masks data.table::between()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::first()     masks data.table::first()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::last()      masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pracma)
## 
## Attaching package: 'pracma'
## The following object is masked from 'package:purrr':
## 
##     cross
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(stats)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
df <- read.csv("wolfberry.csv", row.names = NULL)

# Recode the numeric class codes into polysaccharide labels.
# NOTE(review): assumes column 1 holds the Y codes -- confirm against the CSV.
label_map <- c(`0` = "LBPs", `2` = "Dextran", `3` = "Maltodextrin", `4` = "Starch")
for (code in names(label_map)) {
  df[df$Y == as.numeric(code), 1] <- label_map[[code]]
}

Step 1: PCA

There are two general methods to perform PCA in R:

The function princomp() uses the spectral decomposition approach. The functions prcomp() and PCA()[FactoMineR] use the singular value decomposition (SVD).

# Candidate wavenumber windows (spectral columns are named X<index> in the CSV).
inclusion1 <- paste0("X", 1:779)

inclusion2 <- paste0("X", 1038:1870)

inclusion3 <- paste0("X", 1142:1870)

inclusion4 <- paste0("X", 1298:1870)

# Keep concentration (C), class label (Y) and the third spectral window.
df_sub <- df[, c("C", "Y", inclusion3)]

# Active set: drop class 1 and keep only the two lowest concentrations.
df.active <- df_sub %>% filter(Y != 1, C %in% c(0, 0.05))

df.active_x <- df.active %>% select(-Y, -C)
dim(df.active_x)
## [1] 200 729

PCA on the gradients (first derivatives) of the FTIR spectra

Source

We apply the SVD-based PCA (prcomp()) to the first derivatives of the spectra to reduce sample-to-sample (individual) variance.

# Preview one spectrum alongside its first derivative.
first_spectrum <- as.numeric(df.active_x[1, ])
value_prime <- pracma::gradient(first_spectrum, h1 = 1)
plot(value_prime, type = "l", main = "Gradient", ylab = "", xlab = "")

plot(first_spectrum, type = "l", main = "Original", ylab = "", xlab = "")

# Replace each spectrum by its numerical first derivative; PCA is then run on
# these derivatives rather than on the raw spectra.
derivative <- data.frame()

# NOTE(review): growing a data.frame with rbind() in a loop is O(n^2), and the
# column names of a frame grown from an empty data.frame are a base-R quirk.
# The held-out set is built with the identical pattern later and predict()
# relies on the column names matching, so the construction is kept as-is.
for (i in c(1:nrow(df.active_x))) {
  value_prime <- pracma::gradient(as.numeric(df.active_x[i, ]), h1 = 1)
  # Second derivative is computed but unused (see the commented-out line below).
  value_prime_prime <- pracma::gradient(value_prime, h1 = 1)
  #value_prime <- c(value_prime, value_prime_prime)
  derivative <- rbind(derivative, value_prime)
}
# SVD-based PCA on the derivatives; scale = FALSE leaves variables unscaled.
res.pca <- prcomp(derivative, scale = FALSE)
# Scree plot of the explained variance per component.
fviz_eig(res.pca)

3D visualization of the PCA results

https://plotly.com/r/pca-visualization/

prin_comp <- res.pca
# Percent of variance explained by each principal component.
explained_variance_ratio <- 100 * summary(prin_comp)$importance["Proportion of Variance", ]
components <- data.frame(prin_comp$x)
components <- cbind(components, df.active$Y)
# Flip signs for display only (PC orientation is arbitrary).
components$PC2 <- -components$PC2
components$PC3 <- -components$PC3

# Shared axis styling for the scatter-plot-matrix panels.
axis <- list(showline = FALSE,
             zeroline = FALSE,
             gridcolor = '#ffff',
             ticklen = 4,
             titlefont = list(size = 13))

# Axis label for PC i, including its explained-variance percentage.
pc_label <- function(i) {
  paste0("PC ", i, " (", round(explained_variance_ratio[i], 1), "%)")
}

fig <- components %>%
  plot_ly() %>%
  add_trace(
    type = 'splom',
    dimensions = list(
      list(label = pc_label(1), values = ~PC1),
      list(label = pc_label(2), values = ~PC2),
      list(label = pc_label(3), values = ~PC3),
      list(label = pc_label(4), values = ~PC4)
    ),
    color = ~factor(`df.active$Y`),
    colors = c('#636EFA', '#EF553B', '#00CC96', '#F2C43D')
  ) %>%
  style(diagonal = list(visible = FALSE)) %>%
  layout(
    legend = list(title = list(text = 'color')),
    hovermode = 'closest',
    dragmode = 'select',
    plot_bgcolor = 'rgba(240,240,240, 0.95)',
    xaxis = list(domain = NULL, showline = FALSE, zeroline = FALSE, gridcolor = '#ffff', ticklen = 4),
    yaxis = list(domain = NULL, showline = FALSE, zeroline = FALSE, gridcolor = '#ffff', ticklen = 4),
    xaxis2 = axis,
    xaxis3 = axis,
    xaxis4 = axis,
    yaxis2 = axis,
    yaxis3 = axis,
    yaxis4 = axis
  )

fig
prin_comp <- res.pca

components <- data.frame(prin_comp$x)

# Flip signs for display only (PC orientation is arbitrary).
components$PC2 <- -components$PC2
components$PC3 <- -components$PC3
components <- cbind(components, df.active$Y)

# Total variance captured by all components (percent).
tot_explained_variance_ratio <- 100 * sum(summary(prin_comp)$importance["Proportion of Variance", ])

#tit = 'Total Explained Variance = 99.48'

# 3D scatter of the first three PCs, colored by class.
fig <- plot_ly(
  components,
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~factor(`df.active$Y`),
  colors = c('#636EFA', '#EF553B', '#00CC96', '#F2C43D')
) %>%
  add_markers(size = 30) %>%
  layout(scene = list(bgcolor = "#e5ecf6"))

fig

Step 2: Classification

deriv_df.active <- cbind(derivative, Y = df.active$Y)

Looks like the data label is slightly imbalanced:

library(rsample)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Use the pre-R-3.6.0 'Rounding' sampler so set.seed reproduces the original run.
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(417)
df.active_wo_c <- deriv_df.active
# 80/20 train/test split of the labeled derivative data.
splitted <- initial_split(data = df.active_wo_c, prop = 0.8)

# Class counts in the training partition.
table(training(splitted)$Y) %>% kable() %>% kable_styling()
Var1 Freq
Dextran 40
LBPs 44
Maltodextrin 38
Starch 38
prop.table(table(training(splitted)$Y)) %>% kable() %>% kable_styling()
Var1 Freq
Dextran 0.2500
LBPs 0.2750
Maltodextrin 0.2375
Starch 0.2375

Split the data into training and testing sets: 160 training samples and 40 testing samples.

# Confirm the split sizes (expected: 160 train / 40 test).
print(nrow(training(splitted)))
print(nrow(testing(splitted)))

Multi-class logistic regression, Naive Bayes, linear SVM, and PLS.

https://mentalbreaks.rbind.io/posts/predicting-household-poverty-in-latin-america-part-ii-evaluating-multi-class-classification-models-with-caret/

train <- training(splitted)
test <- testing(splitted)

# Candidate classifiers.
models <- c("multinom", "naive_bayes", "svmLinear", "pls")

# Repeated 5-fold CV (10 repeats) with the one-SE rule; when PCA is used as a
# preprocessing step it keeps 10 components.
myfolds <- createMultiFolds(train$Y, k = 5, times = 10)
control <- trainControl(
  method = "repeatedcv",
  index = myfolds,
  selectionFunction = "oneSE",
  preProcOptions = list(pcaComp = 10)
)

# fit models
set.seed(1)

# Fit one caret model; all models share the same CV control and metric.
fit_one <- function(model) {
  # PLS performs its own projection, so it is the only model fit without PCA.
  prep <- if (model == "pls") c("scale") else c("pca", "scale")
  train(as.factor(Y) ~ .,
        method = model,
        data = train,
        trControl = control,
        metric = "Accuracy",
        preProc = prep)
}

train_models <- lapply(models, fit_one)
## (original run emitted repeated warnings about the legacy 'Rounding' sampler,
## plus one PLS warning: "No convergence in 100 iterations")
names(train_models) <- models
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object)
    object$times$everything["elapsed"])

# Compute each model's confusion matrix ONCE and derive every metric from it.
# (The original recomputed predict() + confusionMatrix() separately for each
# of the seven metrics below -- identical results, 7x the work.)
cms <- lapply(train_models, function(x) {
  confusionMatrix(predict(x, newdata = test), reference = as.factor(test$Y))
})

# Overall accuracy per model.
acc <- sapply(cms, function(cm) cm[["overall"]]["Accuracy"])

# Macro-averaged (unweighted mean over classes) sensitivity / recall.
macro_sens <- sapply(cms, function(cm) mean(cm[["byClass"]][, "Sensitivity"], na.rm = TRUE))

# Per-class sensitivity (classes in rows, models in columns).
sensitivity <- sapply(cms, function(cm) cm[["byClass"]][, "Sensitivity"])

# Macro-averaged precision.
macro_precision <- sapply(cms, function(cm) mean(cm[["byClass"]][, "Precision"], na.rm = TRUE))

# Per-class precision.
precision <- sapply(cms, function(cm) cm[["byClass"]][, "Precision"])

# Per-class F1.
F1 <- sapply(cms, function(cm) cm[["byClass"]][, "F1"])

# Macro-averaged F1.
F1_M <- sapply(cms, function(cm) mean(cm[["byClass"]][, "F1"], na.rm = TRUE))
library(kableExtra)

kable_styling(kable(acc))
x
multinom.Accuracy 1.000
naive_bayes.Accuracy 0.875
svmLinear.Accuracy 1.000
pls.Accuracy 0.625
sensitivity %>% kable() %>% kable_styling()
multinom naive_bayes svmLinear pls
Class: Dextran 1 0.9000000 1 1.00
Class: LBPs 1 1.0000000 1 1.00
Class: Maltodextrin 1 0.9166667 1 0.25
Class: Starch 1 0.7500000 1 0.50
macro_sens %>% kable() %>% kable_styling()
x
multinom 1.0000000
naive_bayes 0.8916667
svmLinear 1.0000000
pls 0.6875000
precision %>% kable() %>% kable_styling()
multinom naive_bayes svmLinear pls
Class: Dextran 1 1.0 1 0.9090909
Class: LBPs 1 0.6 1 0.5454545
Class: Maltodextrin 1 1.0 1 0.5000000
Class: Starch 1 0.9 1 0.5000000
macro_precision  %>% kable() %>% kable_styling()
x
multinom 1.0000000
naive_bayes 0.8750000
svmLinear 1.0000000
pls 0.6136364
F1 %>% kable() %>% kable_styling()
multinom naive_bayes svmLinear pls
Class: Dextran 1 0.9473684 1 0.9523810
Class: LBPs 1 0.7500000 1 0.7058824
Class: Maltodextrin 1 0.9565217 1 0.3333333
Class: Starch 1 0.8181818 1 0.5000000
F1_M %>% kable() %>% kable_styling()
x
multinom 1.0000000
naive_bayes 0.8680180
svmLinear 1.0000000
pls 0.6228992
#predict(mod1, newdata = test, type = "prob")

(Optional) Visualize the derivative

# Plot a few training derivatives, excluding the last column (the Y label).
# FIX: the original wrote `1:ncol(train)-1`, which parses as
# `(1:ncol(train)) - 1`, i.e. 0:(ncol-1); it only worked because R silently
# drops a 0 index. seq_len(ncol(train) - 1) states the intent explicitly.
predictor_cols <- seq_len(ncol(train) - 1)

plot(as.numeric(train[1, predictor_cols]), type = "l", main = train[1, "Y"])

plot(as.numeric(train[24, predictor_cols]), type = "l", main = train[24, "Y"])

plot(as.numeric(train[2, predictor_cols]), type = "l", main = train[2, "Y"])

plot(as.numeric(train[157, predictor_cols]), type = "l", main = train[157, "Y"])

Use the rest of the data as the test set

# Build the held-out evaluation set from the remaining concentrations
# (everything except C == 0.05).
# NOTE(review): C == 0 rows were also in the training pool above -- possible
# leakage; confirm this overlap is intended.
df_sub <- df[, c("C", "Y", inclusion3)]

df.active <- df_sub %>% filter(Y != 1)
df.active <- df.active %>% filter(C != 0.05)

df.active_x <- df.active %>% select(-c(Y, C))
dim(df.active_x)
## [1] 800 729

# Recompute the first derivatives for the held-out rows. The incremental
# rbind is kept so the column names match the training derivative frame
# built the same way earlier (predict() matches newdata by column name).
derivative <- data.frame()

for (i in c(1:nrow(df.active_x))) {
  value_prime <- pracma::gradient(as.numeric(df.active_x[i, ]), h1 = 1)
  derivative <- rbind(derivative, value_prime)
}

# FIX: the original bound df.active$Y (800 labels) to the *old* 200-row
# training derivative BEFORE this loop ran, silently recycling rows into a
# nonsense frame. Build the labeled frame only after the loop.
deriv_df.active <- cbind(derivative, Y = df.active$Y)

test <- cbind(derivative, Y = df.active$Y)
table(test$Y) %>% kable() %>% kable_styling()
Var1 Freq
Dextran 250
LBPs 50
Maltodextrin 250
Starch 250
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object)
    object$times$everything["elapsed"])

# Compute each model's confusion matrix on the held-out set ONCE and derive
# every metric from it (the original recomputed predict() + confusionMatrix()
# separately for each of the seven metrics below).
cms <- lapply(train_models, function(x) {
  confusionMatrix(predict(x, newdata = test), reference = as.factor(test$Y))
})

# Overall accuracy per model.
acc <- sapply(cms, function(cm) cm[["overall"]]["Accuracy"])

# Macro-averaged (unweighted mean over classes) sensitivity / recall.
macro_sens <- sapply(cms, function(cm) mean(cm[["byClass"]][, "Sensitivity"], na.rm = TRUE))

# Per-class sensitivity (classes in rows, models in columns).
sensitivity <- sapply(cms, function(cm) cm[["byClass"]][, "Sensitivity"])

# Macro-averaged precision.
macro_precision <- sapply(cms, function(cm) mean(cm[["byClass"]][, "Precision"], na.rm = TRUE))

# Per-class precision.
precision <- sapply(cms, function(cm) cm[["byClass"]][, "Precision"])

# Per-class F1.
F1 <- sapply(cms, function(cm) cm[["byClass"]][, "F1"])

# Macro-averaged F1.
F1_M <- sapply(cms, function(cm) mean(cm[["byClass"]][, "F1"], na.rm = TRUE))
acc %>% kable() %>% kable_styling()
x
multinom.Accuracy 0.98125
naive_bayes.Accuracy 0.39250
svmLinear.Accuracy 0.95500
pls.Accuracy 0.53500
sensitivity %>% kable() %>% kable_styling()
multinom naive_bayes svmLinear pls
Class: Dextran 0.944 0.288 1.000 0.912
Class: LBPs 1.000 1.000 1.000 1.000
Class: Maltodextrin 1.000 0.304 1.000 0.008
Class: Starch 0.996 0.464 0.856 0.592
macro_sens %>% kable() %>% kable_styling()
x
multinom 0.985
naive_bayes 0.514
svmLinear 0.964
pls 0.628
precision %>% kable() %>% kable_styling()
multinom naive_bayes svmLinear pls
Class: Dextran 0.9957806 0.2345277 0.8771930 0.7261146
Class: LBPs 1.0000000 0.4761905 0.9803922 0.9433962
Class: Maltodextrin 0.9469697 0.5277778 1.0000000 0.0952381
Class: Starch 1.0000000 0.4754098 1.0000000 0.3592233
macro_precision  %>% kable() %>% kable_styling()
x
multinom 0.9856876
naive_bayes 0.4284764
svmLinear 0.9643963
pls 0.5309931
F1 %>% kable() %>% kable_styling()
multinom naive_bayes svmLinear pls
Class: Dextran 0.9691992 0.2585278 0.9345794 0.8085106
Class: LBPs 1.0000000 0.6451613 0.9900990 0.9708738
Class: Maltodextrin 0.9727626 0.3857868 1.0000000 0.0147601
Class: Starch 0.9979960 0.4696356 0.9224138 0.4471299
F1_M %>% kable() %>% kable_styling()
x
multinom 0.9849895
naive_bayes 0.4397779
svmLinear 0.9617731
pls 0.5603186

Regression

Dextran

set.seed(417)
# Dextran-only regression data: predict concentration C from the spectrum.
df.active_dex <- df %>% filter(Y == "Dextran") %>% select(-Y)
splitted <- initial_split(data = df.active_dex, prop = 0.8)
train <- training(splitted)
test <- testing(splitted)
# Concentration counts in the training partition.
kable_styling(kable(table(train$C)))
Var1 Freq
0.05 42
0.1 37
0.3 42
0.5 41
0.7 39
1 39

https://topepo.github.io/caret/model-training-and-tuning.html

# Candidate regressors.
models <- c("lm", "lasso", "ranger", "pls")

# Repeated 5-fold CV (10 repeats), one-SE rule, 10 PCs when PCA is applied.
myfolds <- createMultiFolds(train$C, k = 5, times = 10)
control <- trainControl(
  method = "repeatedcv",
  index = myfolds,
  selectionFunction = "oneSE",
  preProcOptions = list(pcaComp = 10)
)

# fit models
set.seed(1)

# Fit one caret regressor; all models share the same CV control and metric.
fit_one <- function(model) {
  # PLS performs its own projection, so it is the only model fit without PCA.
  prep <- if (model == "pls") c("scale") else c("pca", "scale")
  train(C ~ .,
        method = model,
        data = train,
        trControl = control,
        metric = "RMSE",
        preProc = prep)
}

train_models <- lapply(models, fit_one)
## (original run emitted one legacy 'Rounding' sampler warning per model)
names(train_models) <- models
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object) object$times$everything["elapsed"])

# Hold-out RMSE, R^2 and MAE for each model.
rmse <- sapply(train_models, function(x) {
  postResample(pred = predict(x, newdata = test %>% select(-C)), obs = test$C)
})
rmse
##                  lm      lasso     ranger        pls
## RMSE     0.07738918 0.08193581 0.08707198 0.08829405
## Rsquared 0.94939923 0.94409055 0.95698336 0.93442858
## MAE      0.04865527 0.05239330 0.06825042 0.04717642
library(ggplot2)

# Predicted-vs-true scatter with a y = x reference line. The reference-line
# grid follows nrow(df) rather than the original's hard-coded 60 points, so
# the helper also works for test sets of other sizes; it also removes the
# four near-identical copies of the plotting code.
pred_vs_true_plot <- function(plot_table, title) {
  n <- nrow(plot_table)
  ggplot(plot_table) +
    geom_line(aes(x = seq(0, 1, length.out = n),
                  y = seq(0, 1, length.out = n))) +
    geom_point(aes(x = pred, y = test), size = 0.5) +
    theme_bw() +
    xlab("Predicted concentration") +
    ylab("True concentration") +
    ggtitle(title)
}

# Predict once for all models instead of once per plot.
preds <- predict(train_models, newdata = test)

plot_table <- data.frame(pred = preds$lm, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.052, p = 0.9586 (means 0.4517 vs 0.4550)
g1 <- pred_vs_true_plot(plot_table, "Dextran - Linear regression")

plot_table <- data.frame(pred = preds$lasso, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.105, p = 0.9162 (means 0.4485 vs 0.4550)
g2 <- pred_vs_true_plot(plot_table, "Dextran - LASSO")

plot_table <- data.frame(pred = preds$ranger, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.053, p = 0.9576 (means 0.4519 vs 0.4550)
g3 <- pred_vs_true_plot(plot_table, "Dextran - Random forest")

plot_table <- data.frame(pred = preds$pls, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.098, p = 0.9222 (means 0.4489 vs 0.4550)
g4 <- pred_vs_true_plot(plot_table, "Dextran - Partial least square")

ggpubr::ggarrange(g1, g2, g3, g4)

Maltodextrin

set.seed(417)
# Maltodextrin-only regression data: predict concentration C from the spectrum.
df.active_dex <- df %>% filter(Y == "Maltodextrin") %>% select(-Y)
splitted <- initial_split(data = df.active_dex, prop = 0.8)

train <- training(splitted)
test <- testing(splitted)

# Candidate regressors.
models <- c("lm", "lasso", "ranger", "pls")

# Repeated 5-fold CV (10 repeats), one-SE rule, 10 PCs when PCA is applied.
myfolds <- createMultiFolds(train$C, k = 5, times = 10)
control <- trainControl(
  method = "repeatedcv",
  index = myfolds,
  selectionFunction = "oneSE",
  preProcOptions = list(pcaComp = 10)
)

# fit models
set.seed(1)

# Fit one caret regressor, echoing the model name as a progress marker.
fit_one <- function(model) {
  print(model)
  # PLS performs its own projection, so it is the only model fit without PCA.
  prep <- if (model == "pls") c("scale") else c("pca", "scale")
  train(C ~ .,
        method = model,
        data = train,
        trControl = control,
        metric = "RMSE",
        preProc = prep)
}

train_models <- lapply(models, fit_one)
## (original run printed each model name plus one legacy 'Rounding' sampler
## warning per model)
names(train_models) <- models
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object) object$times$everything["elapsed"])

# Hold-out RMSE, R^2 and MAE for each model.
rmse <- sapply(train_models, function(x) {
  postResample(pred = predict(x, newdata = test %>% select(-C)), obs = test$C)
})
rmse
##                  lm      lasso     ranger        pls
## RMSE     0.07403046 0.07881876 0.06172400 0.08271762
## Rsquared 0.95460759 0.95005914 0.97455226 0.94240234
## MAE      0.05911742 0.06410679 0.04666944 0.06379732
# Predicted-vs-true scatter with a y = x reference line. The reference-line
# grid follows nrow(df) rather than the original's hard-coded 60 points, and
# replaces the four near-identical copies of the plotting code.
pred_vs_true_plot <- function(plot_table, title) {
  n <- nrow(plot_table)
  ggplot(plot_table) +
    geom_line(aes(x = seq(0, 1, length.out = n),
                  y = seq(0, 1, length.out = n))) +
    geom_point(aes(x = pred, y = test), size = 0.5) +
    theme_bw() +
    xlab("Predicted concentration") +
    ylab("True concentration") +
    ggtitle(title)
}

# Predict once for all models instead of once per plot.
preds <- predict(train_models, newdata = test)

plot_table <- data.frame(pred = preds$lm, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.166, p = 0.8681 (means 0.4446 vs 0.4550)
g1 <- pred_vs_true_plot(plot_table, "Maltodextrin - Linear regression")

plot_table <- data.frame(pred = preds$lasso, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.148, p = 0.8829 (means 0.4460 vs 0.4550)
g2 <- pred_vs_true_plot(plot_table, "Maltodextrin - LASSO")

plot_table <- data.frame(pred = preds$ranger, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = 0.051, p = 0.9595 (means 0.4581 vs 0.4550)
g3 <- pred_vs_true_plot(plot_table, "Maltodextrin - Random forest")

plot_table <- data.frame(pred = preds$pls, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.075, p = 0.9404 (means 0.4504 vs 0.4550)
g4 <- pred_vs_true_plot(plot_table, "Maltodextrin - Partial least square")

ggpubr::ggarrange(g1, g2, g3, g4)

set.seed(417)
# Starch-only regression data: predict concentration C from the spectrum.
df.active_dex <- df %>% filter(Y == "Starch") %>% select(-Y)
splitted <- initial_split(data = df.active_dex, prop = 0.8)

train <- training(splitted)
test <- testing(splitted)

# Candidate regressors.
models <- c("lm", "lasso", "ranger", "pls")

# Repeated 5-fold CV (10 repeats), one-SE rule, 10 PCs when PCA is applied.
myfolds <- createMultiFolds(train$C, k = 5, times = 10)
control <- trainControl(
  method = "repeatedcv",
  index = myfolds,
  selectionFunction = "oneSE",
  preProcOptions = list(pcaComp = 10)
)

# fit models
set.seed(1)

# Fit one caret regressor, echoing the model name as a progress marker.
fit_one <- function(model) {
  print(model)
  # PLS performs its own projection, so it is the only model fit without PCA.
  prep <- if (model == "pls") c("scale") else c("pca", "scale")
  train(C ~ .,
        method = model,
        data = train,
        trControl = control,
        metric = "RMSE",
        preProc = prep)
}

train_models <- lapply(models, fit_one)
## (original run printed each model name plus one legacy 'Rounding' sampler
## warning per model, and for ranger the note "only 2 unique complexity
## parameters in default grid. Truncating the grid to 2")
names(train_models) <- models
# Elapsed training time per model.
elapsed <- sapply(train_models, function(object) object$times$everything["elapsed"])

# Hold-out RMSE, R^2 and MAE for each model.
rmse <- sapply(train_models, function(x) {
  postResample(pred = predict(x, newdata = test %>% select(-C)), obs = test$C)
})
rmse
##                  lm      lasso     ranger       pls
## RMSE     0.07880187 0.07584639 0.09978973 0.1605508
## Rsquared 0.95029595 0.95318785 0.93793593 0.8095465
## MAE      0.06049102 0.05653805 0.06107886 0.1348417
# Predicted-vs-true scatter with a y = x reference line. The reference-line
# grid follows nrow(df) rather than the original's hard-coded 60 points, and
# replaces the four near-identical copies of the plotting code.
pred_vs_true_plot <- function(plot_table, title) {
  n <- nrow(plot_table)
  ggplot(plot_table) +
    geom_line(aes(x = seq(0, 1, length.out = n),
                  y = seq(0, 1, length.out = n))) +
    geom_point(aes(x = pred, y = test), size = 0.5) +
    theme_bw() +
    xlab("Predicted concentration") +
    ylab("True concentration") +
    ggtitle(title)
}

# Predict once for all models instead of once per plot.
preds <- predict(train_models, newdata = test)

plot_table <- data.frame(pred = preds$lm, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.218, p = 0.8279 (means 0.4411 vs 0.4550)
g1 <- pred_vs_true_plot(plot_table, "Starch - Linear regression")

plot_table <- data.frame(pred = preds$lasso, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.201, p = 0.8413 (means 0.4426 vs 0.4550)
g2 <- pred_vs_true_plot(plot_table, "Starch - LASSO")

plot_table <- data.frame(pred = preds$ranger, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = -0.453, p = 0.6513 (means 0.4285 vs 0.4550)
g3 <- pred_vs_true_plot(plot_table, "Starch - Random forest")

plot_table <- data.frame(pred = preds$pls, test = test$C)
t.test(plot_table$pred, plot_table$test)
## Welch two-sample t-test: t = 0.059, p = 0.9528 (means 0.4583 vs 0.4550)
g4 <- pred_vs_true_plot(plot_table, "Starch - Partial least square")

ggpubr::ggarrange(g1, g2, g3, g4)